import numpy as np
import pandas as pd
from statsmodels.discrete.discrete_model import Probit
from sklearn.preprocessing import scale 
import Functions.Miscellaneous.utils as ut




def disclosure_probability(X, dim):
    """Average the ``is_publishing`` flag within each (``dim``, ``fyear``) cell.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain the columns ``dim``, ``'fyear'`` and ``'is_publishing'``.
        The ``dim`` column is concatenated with a string below, so it has to
        hold string values.
    dim : str
        Name of the grouping column (besides ``'fyear'``).

    Returns
    -------
    tuple
        ``(table, dim)`` where ``table`` has one row per (``dim``, ``fyear``)
        group with the columns ``dim``, ``'fyear'``, ``'disclosure_prob' + dim``
        (the group mean of ``is_publishing``) and ``'mrg'`` (a
        ``"<dim value>-<year>"`` key); ``dim`` is returned unchanged.
    """
    rate_column = 'disclosure_prob' + dim
    # Rename up front so the aggregated column already carries its final name.
    subset = X[[dim, 'fyear', 'is_publishing']].rename(columns={'is_publishing': rate_column})
    table = subset.groupby([dim, 'fyear']).mean().reset_index()
    # Years are cast through int so that e.g. 2010.0 is rendered as "2010".
    year_text = table['fyear'].astype(int).astype(str)
    table['mrg'] = table[dim] + '-' + year_text
    return table, dim
def SelfSelection(cmp_for_ss):
    """Fit a probit model of CSR disclosure and derive a Mills-ratio term.

    The function builds firm-level regressors from ``cmp_for_ss``, labels each
    firm-year as publishing / not publishing from two local CSV files, adds
    sector- and region-level disclosure rates, fits a probit of
    ``is_publishing`` on those regressors and returns a Mills-ratio column
    together with a small table of standardised coefficients.

    Parameters
    ----------
    cmp_for_ss : pandas.DataFrame
        Firm-year panel. The code reads the columns ``gvkey``, ``fyear``,
        ``mrg``, ``loc``, ``ids``, ``firm_size``, ``GICS_level_2``,
        ``GICS_level_3``, ``at_usd``, ``ebitda_usd``, ``ppent_usd``,
        ``dltt_usd``, ``dlc_usd``, ``ceq_usd`` and ``mib_usd``. The input is
        copied and never modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(MillsRatio, hc_model)``.
        ``MillsRatio`` has one row per observation of the estimation sample,
        with the sample index (``mrg``) as a column and a ``Mills_Ratio``
        column.
        ``hc_model`` has one row per regressor from the first probit parameter
        up to and including ``Tangibility``; its single column holds
        ``coefficient * std(regressor)`` rounded to two decimals followed by
        the marker returned by ``ut.utils().significance``.

    Notes
    -----
    Reads ``local_data/refinitiv_csr_reporting_0423.csv``,
    ``local_data/LC_dataset_v_1_1H.csv`` and
    ``local_data/CountryToRegionMapping.csv`` relative to the working
    directory. Raises ``KeyError`` if the sample contains no ``Financials``
    sector or no ``Africa`` region, because those dummy columns are dropped
    by name.
    """
    cmp = cmp_for_ss.copy()

    # --- Firm-level regressors -------------------------------------------
    # Previous row's total assets within each firm.
    # NOTE(review): assumes rows are ordered by year within gvkey -- confirm.
    cmp['at_lagged'] = cmp[['at_usd', 'gvkey']].groupby('gvkey').shift(1)
    cmp['Profitability'] = cmp['ebitda_usd']/cmp['at_lagged']
    cmp['Tangibility'] = cmp['ppent_usd']/cmp['at_usd']
    # Only a missing minority interest is treated as zero; a missing value in
    # any other component leaves the sum NaN and the row is dropped below.
    cmp['investment'] = cmp['dltt_usd'] + cmp['dlc_usd']  + cmp['ceq_usd'] + cmp['mib_usd'].replace([np.nan], [0])
    # Explicit copy so the column assignment below does not write to a slice.
    cmp = cmp[cmp['investment'] > 0].copy()
    cmp['investment'] = cmp['investment'].apply(np.log)
    # Both comparisons are False for NaN, so this keeps finite values only.
    cmp = cmp[cmp['Profitability'] > -np.inf]
    cmp = cmp[cmp['Profitability'] < np.inf]

    # --- Disclosure labels from the CSR reporting file --------------------
    obs = pd.read_csv('local_data/refinitiv_csr_reporting_0423.csv')
    obs['CSR_Sustainability_Reporting'] = obs['CSR_Sustainability_Reporting'].replace(['0', '1', 'False', True, 'True'], [0,1,0,1, 1])
    obs['CSR_Sustainability_Reporting'] = obs['CSR_Sustainability_Reporting'].replace([np.nan], [0])
    obs = obs.dropna(subset = ['fyear']).reset_index(drop = True)
    # A firm counts as publishing if it reported in at least one year of the
    # file (evaluated before the year window below is applied).
    x = obs[['CSR_Sustainability_Reporting', 'gvkey']].groupby('gvkey').sum()
    referenceList = x[x!= 0].dropna().index
    # Vectorised membership test instead of a per-row Python loop.
    obs.insert(0, 'is_publishing', obs['gvkey'].isin(referenceList).astype('int64').to_numpy())

    # fyear is a string here; keep the part before the first '-' and compare
    # lexicographically to keep 2010..2021.
    obs['fyear'] = obs['fyear'].apply(lambda x: x.split('-')[0])
    obs = obs[obs.fyear > '2009']
    obs = obs[obs.fyear < '2022'].copy()
    # make_mrg is a project helper; presumably it builds a gvkey-year key
    # compatible with cmp['mrg'] -- TODO confirm.
    obs['mrg'] = ut.utils().make_mrg(obs, 'fyear')
    obs = obs.groupby('mrg').last().reset_index()
    obs =obs.rename(columns = {'GICS_Sector_Name': 'GICS_level_1'})
    obs= obs[['gvkey', 'is_publishing', 'GICS_level_1', 'fyear', 'mrg']]

    # Firm-years present in the LC dataset but absent from the CSR file are
    # appended and flagged as publishing.
    left_over = pd.read_csv('local_data/LC_dataset_v_1_1H.csv')
    left_over = left_over[left_over.rfyear > 2009].copy()
    left_over['mrg'] = ut.utils().make_mrg(left_over, 'rfyear')
    left_over = left_over[~left_over.mrg.isin(obs.mrg)]
    left_over = left_over[['gvkey', 'GICS_level_1', 'rfyear', 'mrg']].rename(columns = {'rfyear': 'fyear'})
    left_over.insert(0, 'is_publishing', 1)

    obs = pd.concat((obs, left_over)).sort_values(by = ['gvkey', 'fyear'])
    obs = obs.groupby('mrg').last().reset_index()
    # Inner merges on the shared columns ('mrg', then 'loc'): observations
    # without a match in cmp or in the region mapping are dropped.
    obs = obs.merge(cmp[['loc', 'mrg', 'GICS_level_2', 'GICS_level_3', 'ids']])
    CRM  = pd.read_csv('local_data/CountryToRegionMapping.csv')[['SP_GEOGRAPHY', 'iso_code']].groupby('iso_code').last().reset_index()
    obs = obs.merge(CRM[['SP_GEOGRAPHY', 'iso_code']].rename(columns = {'iso_code': 'loc'})).rename(columns = {'SP_GEOGRAPHY': 'macro_regions'})


    #=== Probability of disclosure
    # For each dimension, attach the share of publishing observations in the
    # same (dimension value, year) cell. 'mrg' is temporarily reused as the
    # cell key for the merge and restored to the firm-year key afterwards.
    disclosure_prob = []
    for dim in ['GICS_level_1', 'macro_regions']:
        dp_, _ = disclosure_probability(obs, dim)
        obs.loc[:,'mrg'] = obs[dim]+'-'+obs['fyear'].astype(int).astype(str)
        obs = obs.merge( dp_[['disclosure_prob'+dim, 'mrg']])
        disclosure_prob.append('disclosure_prob'+dim)
    obs['mrg'] = obs['gvkey'].astype(str)+'-'+obs['fyear'].astype(str)

    # --- Estimation sample -------------------------------------------------
    ds = cmp[cmp.gvkey.isin(obs.gvkey.unique())]
    ds = ds.merge(CRM[['SP_GEOGRAPHY', 'iso_code']].rename(columns = {'iso_code': 'loc'})).rename(columns = {'SP_GEOGRAPHY': 'macro_regions'})

    ds = ds[ds.fyear > 2009]
    ds = ds[ds.fyear < 2022]
    # Year dummies (first year is the reference); their column labels are the
    # year values themselves.
    yer = pd.get_dummies(ds.fyear, drop_first = True).astype(int)
    ds = pd.concat((ds, yer), axis = 1)
    vars_ = ['Profitability', 'firm_size', 'investment', 'Tangibility']+list(yer.columns)

    # gvkey is moved into the index so the merge does not produce duplicated
    # gvkey_x / gvkey_y columns; the gvkey used afterwards comes from obs.
    L = ds[['gvkey', 'macro_regions', 'mrg'] + vars_].set_index('gvkey')

    L = L.merge(obs.drop(columns = ['macro_regions']), on = 'mrg')
    L['mrg'] = L['gvkey'].astype(int).astype(str)+'-'+L['fyear'].astype(int).astype(str)

    # Sector and region dummies with 'Financials' and 'Africa' as references.
    s_dummies = pd.get_dummies(L['GICS_level_1'], drop_first = False)
    s_dummies = s_dummies.drop(columns = ['Financials']).astype(int)
    g_dummies = pd.get_dummies(L['macro_regions'], drop_first = False)
    g_dummies = g_dummies.drop(columns = ['Africa']).astype(int)
    #=========
    L = pd.concat((L, s_dummies, g_dummies), axis = 1).set_index('mrg')
    M = L[['is_publishing'] + disclosure_prob + vars_ + list(g_dummies.columns) + list(s_dummies.columns)].dropna()
    # Remove rows containing +/-inf in any column.
    M = M[M<np.inf].dropna()
    M = M[M>-np.inf].dropna()
    # Trim the extreme 0.1% tails of Profitability (the upper cut is computed
    # after the lower cut has been applied).
    M = M[M.Profitability > M.Profitability.quantile(0.001)]
    M = M[M.Profitability < M.Profitability.quantile(0.999)]
    # Keep a single row per firm-year key.
    M = M.groupby(M.index).last()
    X = M[M.columns[1:]]
    Y = M[M.columns[0]]

    # --- Probit and Mills ratio ---------------------------------------------
    # NOTE(review): no constant column is added to X and statsmodels does not
    # add one automatically -- confirm that the omission is intended.
    prob_model = Probit(Y, X)
    res = prob_model.fit()
    estimated_prob = prob_model.predict(res.params, X)
    # NOTE(review): the textbook inverse Mills ratio is pdf(z)/cdf(z) at the
    # probit linear index z = X @ params. Here z is the standardised output of
    # predict(), which by default is documented as the fitted probability.
    # Left unchanged because downstream results depend on it -- confirm that
    # this is intentional.
    scaled_prob = scale(estimated_prob)
    IMR = prob_model.pdf(scaled_prob)/prob_model.cdf(scaled_prob)

    MillsRatio = pd.DataFrame(IMR, index = X.index, columns = ['Mills_Ratio']).reset_index()

    # Coefficients scaled by the regressor's standard deviation, reported for
    # the parameters up to and including 'Tangibility' (i.e. without the year,
    # region and sector dummies).
    hc_model = res.params*X.std()
    hc_model = [str(np.round(hc_model.loc[v], 2))+ut.utils().significance(res.pvalues.loc[v]) for v in res.params.loc[:'Tangibility'].index]
    hc_model = pd.DataFrame(hc_model, index = res.params.loc[:'Tangibility'].index)
    return MillsRatio, hc_model

